import pandas as pd
import numpy as np

individual = pd.read_csv('/REDACTED/data/final/individualBISG2014_full_final.csv')
individual_cbg_2014 = pd.read_csv("/REDACTED/BIFSG/geocodes_ty2014.csv")
cbg_race_prob = pd.read_stata("/REDACTED/race_cbg_probs_5yr_2014.dta")

state_codes = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'PR': '72', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}


individual_cbg_2014.drop('recid', axis=1, inplace=True)                     
individual_cbg_2014 = individual_cbg_2014.loc[individual_cbg_2014['block'].astype(str).str.len() == 4]
individual_cbg_2014['fips'] = individual_cbg_2014['state'].map(state_codes)
individual_cbg_2014.drop_duplicates(inplace = True)
individual_cbg_2014.sort_values(['taxpayer_id', 'block', 'tract', 'countyFp', 'state'], ascending = [True, True, True, True, True], inplace = True)
individual_cbg_2014 = individual_cbg_2014.groupby('taxpayer_id').first().reset_index()
individual_cbg_2014['cbg'] = individual_cbg_2014['fips'] + individual_cbg_2014['countyFp'].astype(str).str.zfill(3) + individual_cbg_2014['tract'].astype(str).str.zfill(6) + individual_cbg_2014['block'].astype(str).str[0]


merged_first = individual.merge(individual_cbg_2014, on = ['taxpayer_id'], how = 'left')
merged_first['cbg'] = merged_first['cbg'].astype(np.float64)

merged_final = merged_first.merge(cbg_race_prob, on = ['cbg'], how = 'left')
merged_final.to_csv('/REDACTED/geo_only_fully_merged_test.csv')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# create figures/tables

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
from matplotlib.patches import Patch
import matplotlib.lines as mlines
sys.path.insert(1,'/REDACTED/fairness/code/config')
from configureColors import *
sys.path.insert(1,'/REDACTED/fairness/code/utilities')
import weightedBinscatter as wb
import estimators as et
#sys.path.append('packages')
#import binscatter
import matplotlib.font_manager as fm
import matplotlib
import statsmodels.formula.api as smf
from statsmodels.iolib.smpickle import load_pickle
import joblib
from trajectoryPlotters import *

# set plot defaults
plt.style.use('/REDACTED/fairness/code/config/fairness.mplstyle')
fe = fm.FontEntry(
    fname='/REDACTED/fairness/code/config/fonts/cmunrm.ttf',
    name='latex')
fm.fontManager.ttflist.insert(0, fe)
matplotlib.rcParams['font.family'] = fe.name

colorsBNB = cdictBlackNonBlack()
colorsENE = cdictEITCNon()
cb = colorsBNB['Black']['color']
ab = colorsBNB['Black']['alpha']
cnb = colorsBNB['non-Black']['color']
anb = colorsBNB['non-Black']['alpha']
ce = colorsENE['eitc']['color']
cne = colorsENE['non']['color']
ae = colorsENE['eitc']['alpha']
ane = colorsENE['non']['alpha']

## Linear Estimator
def regEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
	model = sm.OLS.from_formula(outcome+' ~ '+pbvarb, data = dataset).fit(cov_type = 'HC1')
	coef = model.params[pbvarb]
	se = model.bse[pbvarb]
	black_aud_rate = model.params[0] + model.params[1]
	non_black_aud_rate = model.params[0] 
	return coef, se, black_aud_rate, non_black_aud_rate

## Probabilistic Estimator 
def chenEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
	black_aud_rate = (dataset[pbvarb]*dataset[outcome]).sum()/(dataset[pbvarb]).sum()
	non_black_aud_rate = ((1-dataset[pbvarb])*dataset[outcome]).sum()/(1-dataset[pbvarb]).sum()
	est =  black_aud_rate - non_black_aud_rate
	return est, black_aud_rate, non_black_aud_rate

## Get standard errors
def getSEs(dataset,pbvarb='predicted_prob_black',outcome='audited', seReg = 0.0):
	seMultiplier=getSEMultiplier(dataset,pbvarb)
	#seReg = regEstimate(dataset,pbvarb,outcome)[1]
	seChen = seReg*seMultiplier
	return seReg, seChen

def getSEMultiplier(dataset,pbvarb='predicted_prob_black'):
	return np.sqrt(dataset[pbvarb].var()/(dataset[pbvarb].mean()*(1-dataset[pbvarb].mean())))

### OUTPUT

out = '/REDACTED/'


#individual = pd.read_csv('/REDACTED/geo_only_fully_merged.csv')
#individual = individual[individual.race_by_cbg_black.notnull()]
individual = merged_final[merged_final.race_by_cbg_black.notnull()]
ncdata = pd.read_csv("/REDACTED/NC_Analysis_Dataset_2014_tplevel.csv")
### NC Weights
ncweights = pd.read_csv("/REDACTED/HertzGraff/nc_reweights_v4_April.csv")

nc_BISG = individual[individual['state_x']=="NC"]

ncdata = ncdata[ncdata.black_ind.notna()]
ncdata = ncdata[['taxpayer_id', 'black_ind']]

ncweights = ncweights[['taxpayer_id', 'uswgt']]


ncdata = pd.merge(ncdata, ncweights, on = 'taxpayer_id')
ncdata = pd.merge(ncdata, nc_BISG, on = 'taxpayer_id')


ncdata['unitwt'] = 1
ncdata['p_black_rd'] = ncdata['race_by_cbg_black'].round(2)
ncdata['p_black_rd'] = pd.to_numeric(ncdata['p_black_rd'])
ncdata['race_by_cbg_black'] = pd.to_numeric(ncdata['race_by_cbg_black'])
ncdata['black_ind'] = pd.to_numeric(ncdata['black_ind'])
ncdata['black_ind_wt'] = ncdata['black_ind'] * ncdata['uswgt']

grouped1 = ncdata.groupby(['p_black_rd','isEIC']).black_ind.mean().reset_index()
groupedwt1 = (ncdata.groupby(['p_black_rd','isEIC']).black_ind_wt.sum()/ncdata.groupby(['p_black_rd','isEIC']).uswgt.sum()).reset_index()
ovwt1 = (ncdata.groupby('p_black_rd').black_ind_wt.sum()/ncdata.groupby('p_black_rd').uswgt.sum()).reset_index()



~~~~~~~~~~~~

######################
##### Figure A.10 (left): Distribution and Calibration of Race Imputation
######################

# probBlack_hist_fw_big.png

individual['pBlack_bucket'] = pd.cut(individual.race_by_cbg_black,bins=[i/20 for i in range(22)],labels=[i/20 for i in range(21)], right=False,include_lowest=True).astype(float)
pct_black_counts = individual.groupby('pBlack_bucket').taxpayer_id.count().reset_index()
fig,ax=plt.subplots(figsize=(10,8))
plt.bar(pct_black_counts.pBlack_bucket,pct_black_counts.taxpayer_id/pct_black_counts.taxpayer_id.sum(),width=0.05)
plt.xlabel('Geography-Predicted Probability Black')
plt.ylabel('Population Share')
plt.savefig(out+'probBlack_hist_fw_big.png')
plt.close('all')


######################
##### Figure A.10 (right): Distribution and Calibration of Race Imputation
######################

# calibration_pblack_01_uswgt_big.png

fig,ax=plt.subplots(figsize=(10,8))
plt.plot(groupedwt1[groupedwt1.isEIC==0].p_black_rd,groupedwt1[groupedwt1.isEIC==0][0],label='Non-EITC Claimants',color=cne,alpha=ane)
plt.plot(groupedwt1[groupedwt1.isEIC==1].p_black_rd,groupedwt1[groupedwt1.isEIC==1][0],label='EITC Claimants',color=ce,alpha=ae)
plt.plot(ovwt1.p_black_rd,ovwt1[0],label='Overall')
plt.plot([0,1],[0,1],linestyle='--',color='black',linewidth=3)
plt.xlabel('Geography-Predicted Probability Black')
plt.ylabel('True Share Black')
plt.legend()
plt.savefig(out+'calibration_pblack_01_uswgt_big.png')
plt.close('all')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


#########################
#### Fig A.11: Audit Rate by Predicted Race Conditional on Self-Reported Race
#########################

# auditrate_p_bs.png

ncdata['unitwt'] = 1
ncb = ncdata[ncdata.black_ind==1]
ncb = ncb.dropna(subset=['race_by_cbg_black'])
ncnb = ncdata[ncdata.black_ind==0]
ncnb = ncnb.dropna(subset=['race_by_cbg_black'])
fig,_ = wb.weightedBinscatterMulti([ncnb,ncb],xlab='Geography-Predicted Probability Black',datalabels=['Non-Black','Black'], x_col='race_by_cbg_black',y_col='aud_no_research_audits',ylab='Audit Rate (%)', w_col='unitwt',nbins=100,alphas=[anb,ab],reportCoefs=False,colors=[cnb,cb], best_lin_fit=True, scale_y=True, shapes=['o', 'x'])
b_lab = mlines.Line2D([],[], markeredgecolor = cb, alpha = ab, marker = 'x', linestyle = 'None', label = 'Black')
nb_lab = mlines.Line2D([],[],markeredgecolor = cnb, markerfacecolor = 'none', alpha = anb, marker = 'o', linestyle = 'None', label = 'Non-Black')
plt.legend(handles = [b_lab, nb_lab])
plt.savefig(out+'auditrate_p_bs_big.png')

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#####################################
############ Table A12 ###############
#####################################

import numpy as np
import pandas as pd
import statsmodels.api as sm
import math
import sys

## Linear Estimator
def regEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
        model = sm.OLS.from_formula(outcome+' ~ '+pbvarb, data = dataset).fit(cov_type = 'HC1')
        coef = model.params[pbvarb]
        se = model.bse[pbvarb]
        black_aud_rate = model.params[0] + model.params[1]
        non_black_aud_rate = model.params[0] 
        print("number of obs :" + str(model.nobs))
        return coef, se, black_aud_rate, non_black_aud_rate

## Probabilistic Estimator 
def chenEstimate(dataset, pbvarb = 'predicted_prob_black', outcome = 'audited'):
        black_aud_rate = (dataset[pbvarb]*dataset[outcome]).sum()/(dataset[pbvarb]).sum()
        non_black_aud_rate = ((1-dataset[pbvarb])*dataset[outcome]).sum()/(1-dataset[pbvarb]).sum()
        est =  black_aud_rate - non_black_aud_rate
        return est, black_aud_rate, non_black_aud_rate

## Get standard errors
def getSEs(dataset,pbvarb='predicted_prob_black',outcome='audited', seReg = 0.0):
        seMultiplier=getSEMultiplier(dataset,pbvarb)
        #seReg = regEstimate(dataset,pbvarb,outcome)[1]
        seChen = seReg*seMultiplier
        return seReg, seChen

def getSEMultiplier(dataset,pbvarb='predicted_prob_black'):
        return np.sqrt(dataset[pbvarb].var()/(dataset[pbvarb].mean()*(1-dataset[pbvarb].mean())))


dataBISG = individual


sys.stdout = open("/REDACTED/disparity_results_big.txt", "w")

linDisparity = regEstimate(dataBISG, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
probDisparity = chenEstimate(dataBISG, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
linSE, probSE = getSEs(dataBISG, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits', seReg = float(linDisparity[1]))

full_pop_overall_lin = linDisparity[0]
full_pop_overall_prob = probDisparity[0]
full_pop_black_lin = linDisparity[2]
full_pop_black_prob = probDisparity[1]
full_pop_non_black_lin = linDisparity[3]
full_pop_non_black_prob = probDisparity[2]


eitc_pop = dataBISG.loc[dataBISG['isEIC'] == 1]

linDisparity_eic = regEstimate(eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
probDisparity_eic = chenEstimate(eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
linSE_eic, probSE_eic = getSEs(eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits', seReg = float(linDisparity_eic[1]))

eitc_overall_lin = linDisparity_eic[0]
eitc_overall_prob = probDisparity_eic[0]
eitc_black_lin = linDisparity_eic[2]
eitc_black_prob = probDisparity_eic[1]
eitc_non_black_lin = linDisparity_eic[3]
eitc_non_black_prob = probDisparity_eic[2]


non_eitc_pop = dataBISG.loc[dataBISG['isEIC'] == 0]

linDisparity_non_eic = regEstimate(non_eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
probDisparity_non_eic = chenEstimate(non_eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits')
linSE_non_eic, probSE_non_eic = getSEs(non_eitc_pop, pbvarb = 'race_by_cbg_black', outcome = 'aud_no_research_audits', seReg = float(linDisparity_non_eic[1]))

non_eitc_overall_lin = linDisparity_non_eic[0]
non_eitc_overall_prob = probDisparity_non_eic[0]
non_eitc_black_lin = linDisparity_non_eic[2]
non_eitc_black_prob = probDisparity_non_eic[1]
non_eitc_non_black_lin = linDisparity_non_eic[3]
non_eitc_non_black_prob = probDisparity_non_eic[2]

print("Full population \n")
print("Overall Linear Disparity: "+str(full_pop_overall_lin))
print("Black Linear Disparity: "+str(full_pop_black_lin))
print("Non-Black Linear Disparity: "+str(full_pop_non_black_lin))
print("Linear Standard Error: "+str(linSE))
print("Overall Probabilistic Disparity: "+str(full_pop_overall_prob))
print("Black Probabilistic Disparity: "+str(full_pop_black_prob))
print("Non-Black Probabilistic Disparity: "+str(full_pop_non_black_prob))
print("Probabilistic Standard Error: "+str(probSE)+"\n")


print("EITC population \n")
print("Overall Linear Disparity: "+str(eitc_overall_lin))
print("Black Linear Disparity: "+str(eitc_black_lin))
print("Non-Black Linear Disparity: "+str(eitc_non_black_lin))
print("Linear Standard Error: "+str(linSE_eic))
print("Overall Probabilistic Disparity: "+str(eitc_overall_prob))
print("Black Probabilistic Disparity: "+str(eitc_black_prob))
print("Non-Black Probabilistic Disparity: "+str(eitc_non_black_prob))
print("Probabilistic Standard Error: "+str(probSE_eic)+"\n")


print("Non EITC population \n")
print("Overall Linear Disparity: "+str(non_eitc_overall_lin))
print("Black Linear Disparity: "+str(non_eitc_black_lin))
print("Non-Black Linear Disparity: "+str(non_eitc_non_black_lin))
print("Linear Standard Error: "+str(linSE_non_eic))
print("Overall Probabilistic Disparity: "+str(non_eitc_overall_prob))
print("Black Probabilistic Disparity: "+str(non_eitc_black_prob))
print("Non-Black Probabilistic Disparity: "+str(non_eitc_non_black_prob))
print("Probabilistic Standard Error: "+str(probSE_non_eic)+"\n")

sys.stdout = sys.__stdout__


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


######################################
########### Table A4 #################
######################################


varlist = ['Metrics', 'Full Pop Unweighted', 'Full Pop Weighted', 'EITC Unweighted', 'EITC Weighted', 'Non-EITC Unweighted', 'Non-EITC Weighted']
output = pd.DataFrame(columns=varlist)
output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~ncdata.taxpayer_id.isna(), ~ncdata.taxpayer_id.isna(), ncdata.isEIC==1, ncdata.isEIC==1, ncdata.isEIC==0, ncdata.isEIC==0]

for i in range(len(varlist)):
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = ncdata[filters[i]]
        if i%2 == 1:
            est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
            column.append(est*10000)
            column.append(se*10000)
            est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='race_by_cbg_black', conditioningvarb='black_ind')
            column.append(est*10000)
            column.append(se*10000)
            column.append(0)
            output[varlist[i]] = column
        elif i%2 == 0:
            est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
            column.append(est*10000)
            column.append(se*10000)
            est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='uswgt', outcomevarb='aud_no_research_audits', covarb='race_by_cbg_black', conditioningvarb='black_ind')
            column.append(est*10000)
            column.append(se*10000)
            column.append(1)
            output[varlist[i]] = column

output.loc[len(output)] = ["N", len(ncdata), len(ncdata), (ncdata.isEIC==1).sum(), (ncdata.isEIC==1).sum(), (ncdata.isEIC==0).sum(), (ncdata.isEIC==0).sum()]
output.to_latex('/REDACTED/residual_covariance_terms_SE_NC_big_V2.tex', index=False)